Alignment summary

CT1 = 11 PCR cycles CT2 = 8 PCR cycles

datadir = "/rds/general/user/la420/projects/neurogenomics-lab/live/GitRepos/CUT_n_TAG/processed_data/CT1_CT2/alignment/sam/bowtie2_summary/"
sampleList = c("ActiveMotif_Henikoff_8PCR", "Diagenode_50x_Henikoff_8PCR", "Abcam-ab177178_Henikoff_8PCR", "Abcam-ab4729_Henikoff_8PCR", "H3K27me3_Henikoff_8PCR", "ActiveMotif_AM_8PCR", "Diagenode_50x_AM_8PCR", "Abcam-ab177178_AM_8PCR", "Abcam-ab4729_AM_8PCR", "H3K27me3_AM_8PCR", "ActiveMotif_Henikoff_11PCR", "Diagenode_50x_Henikoff_11PCR", "Abcam-ab177178_Henikoff_11PCR", "Abcam-ab4729_Henikoff_11PCR", "H3K27me3_Henikoff_11PCR", "ActiveMotif_AM_11PCR", "Diagenode_50x_AM_11PCR", "Abcam-ab177178_AM_11PCR", "Abcam-ab4729_AM_11PCR", "H3K27me3_AM_11PCR")

## collect the alignment results from the bowtie2 alignment summary files
alignResult = c()
for(sample in sampleList){
  alignRes = read.table(paste0(datadir, sample, "_trimmed_bowtie2.txt"), header = FALSE, fill = TRUE)
  alignRate = substr(alignRes$V1[6], 1, nchar(as.character(alignRes$V1[6]))-1)
  alignResult = data.frame(Sample = sample, SequencingDepth = alignRes$V1[1] %>% as.character %>% as.numeric, 
                           MappedFragNum_hg19 = alignRes$V1[4] %>% as.character %>% as.numeric + alignRes$V1[5] %>% as.character %>% as.numeric,
                           AlignmentRate_hg19 = alignRate %>% as.numeric) %>% rbind(alignResult, .)
}


alignResult %>% mutate(AlignmentRate_hg19 = paste0(AlignmentRate_hg19, "%"))

Duplicates

## summarize the duplication information from the picard summary outputs
picarddir = "/rds/general/user/la420/projects/neurogenomics-lab/live/GitRepos/CUT_n_TAG/processed_data/CT1_CT2/removeDuplicate/picard_summary/"

dupResult = c()
for(sample in sampleList){
  dupRes = read.table(paste0(picarddir, sample, "_picard.rmDup.txt"), header = TRUE, fill = TRUE)
  dupResult = data.frame(Sample = sample, MappedFragNum_hg19 = dupRes$READ_PAIRS_EXAMINED[1] %>% as.character %>% as.numeric, DuplicationRate = dupRes$PERCENT_DUPLICATION[1] %>% as.character %>% as.numeric * 100) %>% mutate(UniqueFragNum = (MappedFragNum_hg19 * (1-DuplicationRate/100)) %>% round()) %>% rbind(dupResult, .)
}

alignDupSummary = left_join(alignResult, dupResult, by = c("Sample", "MappedFragNum_hg19")) %>% mutate(DuplicationRate = paste0(DuplicationRate, "%")) %>% mutate(AlignmentRate_hg19 = paste0(AlignmentRate_hg19, "%"))
alignDupSummary

Fragment sizes

8 PCR cycles

## collect the fragment size information
fragdir = "/rds/general/user/la420/projects/neurogenomics-lab/live/GitRepos/CUT_n_TAG/processed_data/CT1_CT2/alignment/sam/fragmentLen/"

fragLen = c()
for(sample in sampleList[1:10]){
  fragLen = read.table(paste0(fragdir, sample, "_rmDup_fragmentLen.txt"), header = FALSE) %>% mutate(fragLen = V1 %>% as.numeric, fragCount = V2 %>% as.numeric, Weight = as.numeric(V2)/sum(as.numeric(V2)), Sample = sample) %>% rbind(fragLen, .) 
}

fig1 = fragLen %>% ggplot(aes(x = Sample, y = fragLen, weight = Weight, fill = Sample)) +
    geom_violin(bw = 5) +
    scale_y_continuous(breaks = seq(0, 800, 50)) +
    scale_fill_viridis(discrete = TRUE, begin = 0.1, end = 0.9, option = "magma", alpha = 0.8) +
    scale_color_viridis(discrete = TRUE, begin = 0.1, end = 0.9) +
    theme_bw(base_size = 20) +
    ggpubr::rotate_x_text(angle = 20) +
    ylab("Fragment Length") +
    xlab("") +
    theme(plot.margin = unit(c(0.5,0,0,3), "cm"))
  
fig2 = fragLen %>% ggplot(aes(x = fragLen, y = fragCount, color = Sample, group = Sample)) +
  geom_line(size = 1) +
  scale_color_viridis(discrete = TRUE, begin = 0.1, end = 0.9, option = "magma") +
  theme_bw(base_size = 20) +
  xlab("Fragment Length") +
  ylab("Count") +
  coord_cartesian(xlim = c(0, 500))

ggarrange(fig1, fig2, ncol = 2)

11 PCR cycles

fragdir = "/rds/general/user/la420/projects/neurogenomics-lab/live/GitRepos/CUT_n_TAG/processed_data/CT1_CT2/alignment/sam/fragmentLen/"

fragLen = c()
for(sample in sampleList[11:20]){
  fragLen = read.table(paste0(fragdir, sample, "_rmDup_fragmentLen.txt"), header = FALSE) %>% mutate(fragLen = V1 %>% as.numeric, fragCount = V2 %>% as.numeric, Weight = as.numeric(V2)/sum(as.numeric(V2)), Sample = sample) %>% rbind(fragLen, .) 
}

fig3 = fragLen %>% ggplot(aes(x = Sample, y = fragLen, weight = Weight, fill = Sample)) +
    geom_violin(bw = 5) +
    scale_y_continuous(breaks = seq(0, 800, 50)) +
    scale_fill_viridis(discrete = TRUE, begin = 0.1, end = 0.9, option = "viridis", alpha = 0.8) +
    scale_color_viridis(discrete = TRUE, begin = 0.1, end = 0.9) +
    theme_bw(base_size = 20) +
    ggpubr::rotate_x_text(angle = 20) +
    ylab("Fragment Length") +
    xlab("") +
    theme(plot.margin = unit(c(0.5,0,0,2), "cm"))

fig4 = fragLen %>% ggplot(aes(x = fragLen, y = fragCount, color = Sample, group = Sample)) +
  geom_line(size = 1) +
  scale_color_viridis(discrete = TRUE, begin = 0.1, end = 0.9, option = "viridis") +
  theme_bw(base_size = 20) +
  xlab("Fragment Length") +
  ylab("Count") +
  coord_cartesian(xlim = c(0, 500))

ggarrange(fig3, fig4, ncol = 2)

Peak information

peakdir = "/rds/general/user/la420/projects/neurogenomics-lab/live/GitRepos/CUT_n_TAG/processed_data/CT1_CT2/peakCalling/SEACR/"

hg19_blacklist = ChIPseeker::readPeakFile("/rds/general/user/la420/home/hg19/other/hg19-blacklist.v2.bed.gz", as = "GRanges")

blacklist_counts = c()
peakList = list()

for(sample in sampleList){
  path = paste0(peakdir, sample, "_seacr_top0.01.peaks.stringent.bed")
  peak = ChIPseeker::readPeakFile(file.path(path), as = "GRanges") %>% BRGenomics::tidyChromosomes(keep.X = TRUE, keep.Y = TRUE)
  blacklist_count = length(IRanges::subsetByOverlaps(peak, hg19_blacklist))
  peak = IRanges::subsetByOverlaps(peak, hg19_blacklist, invert = TRUE)
  peakList = c(peakList, peak)
  blacklist_counts = c(blacklist_counts, blacklist_count)
}

peakList = setNames(peakList, sampleList)

Peak numbers

#extraCols_narrowPeak = c(signalValue = "numeric", pValue = "numeric", qValue = "numeric", peak = "integer")

#ENCODE_H3K27ac = rtracklayer::import("https://www.encodeproject.org/files/ENCFF044JNJ/@@download/ENCFF044JNJ.bed.gz", format = "BED", extraCols = extraCols_narrowPeak)
ENCODE_H3K27ac = ChIPseeker::readPeakFile("/rds/general/user/la420/home/CUTnTAG/antibodies/all_peaks/ENCODE/H3K27ac_ENCFF044JNJ.bed.narrowPeak", as = "GRanges")

#ENCODE_H3K27me3 = rtracklayer::import("https://www.encodeproject.org/files/ENCFF126QYP/@@download/ENCFF126QYP.bed.gz", format = "BED", extraCols = extraCols_narrowPeak)
ENCODE_H3K27me3 = ChIPseeker::readPeakFile("/rds/general/user/la420/home/CUTnTAG/antibodies/all_peaks/ENCODE/H3K27me3_ENCFF126QYP.bed.narrowPeak", as = "GRanges")

peakList_with_ENCODE = c(peakList, ENCODE_H3K27ac, ENCODE_H3K27me3)
sampleList_with_ENCODE = c(sampleList, "ENCODE_H3K27ac", "ENCODE_H3K27me3")
peakList_with_ENCODE = setNames(peakList_with_ENCODE, sampleList_with_ENCODE)
Total_CT_peaks = c()
for(i in 1:length(peakList)){
  peakN = length(peakList[[i]])
  Total_CT_peaks = c(Total_CT_peaks, peakN)
}

Total_CT_peaks = data.frame(Sample = sampleList, Total_PeakN = Total_CT_peaks)
Total_CT_peaks
PeakN_summary_melt = melt(PeakN_summary, id.vars = c("Sample"))

PeakN_plot = ggplot(PeakN_summary_melt, aes(Sample, value)) +
  geom_bar(aes(fill = variable), width = 0.7, position = position_dodge(width = 0.75), stat = "identity", alpha = 0.7) +
  theme_light() +
  xlab("Sample") +
  ylab("") +
  ggtitle("Peaks called") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  theme(axis.text = element_text(size = 15)) +
  theme(axis.title = element_text(size = 20)) +
  theme(plot.title = element_text(size = 25)) +
  theme(legend.text = element_text(size = 15)) +
  theme(legend.title = element_text(size = 15)) +
  scale_fill_viridis(name = "Condition", labels = c("Henikoff 8PCR", "AM 8PCR", "Henikoff 11PCR", "AM 11PCR"), discrete = TRUE, begin = 0.1, end = 0.8, option = "viridis")

PeakN_plot

Blacklisted peaks

blacklisted = data.frame(Sample = sampleList, Total_peaks = Total_CT_peaks$Total_PeakN, Blacklisted_peaks = blacklist_counts)
blacklisted$Proportion = blacklisted$Blacklisted_peaks / blacklisted$Total_peaks * 100
blacklisted

Peak widths

peakWidth = c()
for(peak in peakList){
  peakW = GenomicRanges::width(peak) %>% summary() %>% round() %>% list()
  peakWidth = c(peakWidth, peakW)
}

WSummary = c()
for(i in 1:length(peakWidth)){
  WSummary = data.frame(Sample = sampleList[i], MinW = array(peakWidth[[i]][1]), MedianW = array(peakWidth[[i]][3]), MeanW = array(peakWidth[[i]][4]), MaxW = array(peakWidth[[i]][6])) %>% rbind(., WSummary)
}

WSummary

ENCODE overlap

C&T peaks in ENCODE

ENCODE_H3K27ac_overlapping_CT_peaks = c()
ENCODE_H3K27me3_overlapping_CT_peaks = c()

for(i in 1:length(peakList)){
  overlaps_H3K27ac = GenomicRanges::countOverlaps(query = peakList[[i]], subject = ENCODE_H3K27ac)
  overlaps_H3K27me3 = GenomicRanges::countOverlaps(query = peakList[[i]], subject = ENCODE_H3K27me3)
  CT_in_ENCODE_H3K27ac = overlaps_H3K27ac[overlaps_H3K27ac != 0] %>% length()
  CT_in_ENCODE_H3K27me3 = overlaps_H3K27me3[overlaps_H3K27me3 != 0] %>% length()
  ENCODE_H3K27ac_overlapping_CT_peaks = c(ENCODE_H3K27ac_overlapping_CT_peaks, CT_in_ENCODE_H3K27ac)
  ENCODE_H3K27me3_overlapping_CT_peaks = c(ENCODE_H3K27me3_overlapping_CT_peaks, CT_in_ENCODE_H3K27me3)
}

CT_in_ENCODE = data.frame(Sample = sampleList, CT_peaks_in_ENCODE_H3K27ac = ENCODE_H3K27ac_overlapping_CT_peaks, CT_peaks_in_ENCODE_H3K27me3 = ENCODE_H3K27me3_overlapping_CT_peaks)
CT_in_ENCODE

ENCODE peaks in C&T

captured_H3K27ac_ENCODE_list = c()
captured_H3K27me3_ENCODE_list = c()

for(i in 1:length(peakList)){
  overlaps_H3K27ac = GenomicRanges::countOverlaps(query = ENCODE_H3K27ac, subject = peakList[[i]])
  overlaps_H3K27me3 = GenomicRanges::countOverlaps(query = ENCODE_H3K27me3, subject = peakList[[i]])
  ENCODE_H3K27ac_captured_peaks = overlaps_H3K27ac[overlaps_H3K27ac != 0] %>% length()
  ENCODE_H3K27me3_captured_peaks = overlaps_H3K27me3[overlaps_H3K27me3 != 0] %>% length()
  captured_H3K27ac_ENCODE_list = c(captured_H3K27ac_ENCODE_list, ENCODE_H3K27ac_captured_peaks)
  captured_H3K27me3_ENCODE_list = c(captured_H3K27me3_ENCODE_list, ENCODE_H3K27me3_captured_peaks)
}

Total_captured_ENCODE_peaks = data.frame(Sample = sampleList, ENCODE_H3K27ac_captured_peaks = captured_H3K27ac_ENCODE_list, ENCODE_H3K27me3_captured_peaks = captured_H3K27me3_ENCODE_list)
Total_captured_ENCODE_peaks

Summarize

ENCODE_overlaps = Total_captured_ENCODE_peaks %>% left_join(Total_CT_peaks, by = "Sample") %>% left_join(CT_in_ENCODE, by = "Sample")

ENCODE_overlaps$Percentage_of_CT_in_ENCODE_H3K27ac = ENCODE_overlaps$CT_peaks_in_ENCODE_H3K27ac / ENCODE_overlaps$Total_PeakN * 100
ENCODE_overlaps$Percentage_of_total_ENCODE_H3K27ac = ENCODE_overlaps$ENCODE_H3K27ac_captured_peaks / length(ENCODE_H3K27ac) * 100

ENCODE_overlaps$Percentage_of_CT_in_ENCODE_H3K27me3 = ENCODE_overlaps$CT_peaks_in_ENCODE_H3K27me3 / ENCODE_overlaps$Total_PeakN * 100
ENCODE_overlaps$Percentage_of_total_ENCODE_H3K27me3 = ENCODE_overlaps$ENCODE_H3K27me3_captured_peaks / length(ENCODE_H3K27me3) * 100

ENCODE_overlaps_H3K27ac = ENCODE_overlaps[c("Sample", "Total_PeakN", "CT_peaks_in_ENCODE_H3K27ac", "Percentage_of_CT_in_ENCODE_H3K27ac", "ENCODE_H3K27ac_captured_peaks", "Percentage_of_total_ENCODE_H3K27ac")]
ENCODE_overlaps_H3K27ac
ENCODE_overlaps_H3K27me3 = ENCODE_overlaps[c("Sample", "Total_PeakN", "CT_peaks_in_ENCODE_H3K27me3", "Percentage_of_CT_in_ENCODE_H3K27me3", "ENCODE_H3K27me3_captured_peaks", "Percentage_of_total_ENCODE_H3K27me3")]
ENCODE_overlaps_H3K27me3
Percentage_of_CT_in_ENCODE_H3K27ac_summary_melt = melt(Percentage_of_CT_in_ENCODE_H3K27ac_summary, id.vars = "Sample")

Peaks_in_ENCODE_plot = ggplot(Percentage_of_CT_in_ENCODE_H3K27ac_summary_melt, aes(Sample, value)) +
  geom_bar(aes(fill = variable), width = 0.7, position = position_dodge(width = 0.75), stat = "identity", alpha = 0.7) +
  ylim(0, 100) +
  theme_light() +
  ylab("% of sample peaks in ENCODE") +
  xlab("Sample") +
  ggtitle("Proportion of CUT&Tag peaks falling into ENCODE H3K27ac") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  theme(axis.text = element_text(size = 15)) +
  theme(axis.title = element_text(size = 20)) +
  theme(plot.title = element_text(size = 25)) +
  theme(legend.text = element_text(size = 15)) +
  theme(legend.title = element_text(size = 15)) +
  scale_fill_viridis(name = "Condition", labels = c("Henikoff 8PCR", "AM 8PCR", "Henikoff 11PCR", "AM 11PCR"), discrete = TRUE, begin = 0.1, end = 0.8, option = "viridis")

Peaks_in_ENCODE_plot

Percentage_of_total_ENCODE_H3K27ac_summary_melt = melt(Percentage_of_total_ENCODE_H3K27ac_summary, id.vars = "Sample")

Peaks_in_ENCODE_plot = ggplot(Percentage_of_total_ENCODE_H3K27ac_summary_melt, aes(Sample, value)) +
  geom_bar(aes(fill = variable), width = 0.7, position = position_dodge(width = 0.75), stat = "identity", alpha = 0.7) +
  ylim(0, 100) +
  theme_light() +
  ylab("% of ENCODE peaks captured") +
  xlab("Sample") +
  ggtitle("Proportion of ENCODE H3K27ac peaks captured by CUT&Tag") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  theme(axis.text = element_text(size = 15)) +
  theme(axis.title = element_text(size = 20)) +
  theme(plot.title = element_text(size = 25)) +
  theme(legend.text = element_text(size = 15)) +
  theme(legend.title = element_text(size = 15)) +
  scale_fill_viridis(name = "Condition", labels = c("Henikoff 8PCR", "AM 8PCR", "Henikoff 11PCR", "AM 11PCR"), discrete = TRUE, begin = 0.1, end = 0.8, option = "viridis")
Peaks_in_ENCODE_plot

ChromHMM

chrHMM_url = "http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeBroadHmm/wgEncodeBroadHmmK562HMM.bed.gz"
chrHMM = genomation::readBed(chrHMM_url)
chrHMM_list = GenomicRanges::split(chrHMM, chrHMM$name, drop = TRUE) 
annotation = genomation::annotateWithFeatures(GenomicRanges::GRangesList(peakList_with_ENCODE), chrHMM_list)
genomation::heatTargetAnnotation(annotation, col = c("white", "darkorchid4"))